{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-07-27T21:47:06.746550Z",
     "end_time": "2023-07-27T21:47:08.176982Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "downloader, INFO http://base.ustc.edu.cn/data/ASSISTment/2009_skill_builder_data_corrected.zip is saved as data\\2009_skill_builder_data_corrected.zip\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading data\\2009_skill_builder_data_corrected.zip 100.00%: 8.66MB | 8.66MB"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "downloader, INFO data\\2009_skill_builder_data_corrected.zip is unzip to data\\2009_skill_builder_data_corrected\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": "'data\\\\2009_skill_builder_data_corrected'"
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from EduData import get_data\n",
    "\n",
    "get_data(\"assistment-2009-2010-skill\", \"data\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-07-27T21:47:30.868978Z",
     "end_time": "2023-07-27T21:47:31.198605Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "   order_id  user_id  correct  skill_id\n0  33022537    64525        1       1.0\n1  33022709    64525        1       1.0\n2  35450204    70363        0       1.0\n3  35450295    70363        1       1.0\n4  35450311    70363        0       1.0",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>order_id</th>\n      <th>user_id</th>\n      <th>correct</th>\n      <th>skill_id</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>33022537</td>\n      <td>64525</td>\n      <td>1</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>33022709</td>\n      <td>64525</td>\n      <td>1</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>35450204</td>\n      <td>70363</td>\n      <td>0</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>35450295</td>\n      <td>70363</td>\n      <td>1</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>35450311</td>\n      <td>70363</td>\n      <td>0</td>\n      <td>1.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import random\n",
    "import pandas as pd\n",
    "import tqdm\n",
    "import numpy as np\n",
    "\n",
    "data = pd.read_csv(\n",
    "    'data/2009_skill_builder_data_corrected/skill_builder_data_corrected.csv',\n",
    "    usecols=['order_id', 'user_id', 'skill_id', 'correct']\n",
    ").dropna(subset=['skill_id'])\n",
    "\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-07-27T21:47:36.649000Z",
     "end_time": "2023-07-27T21:47:36.661683Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of skills: 123\n"
     ]
    }
   ],
   "source": [
    "raw_question = data.skill_id.unique().tolist()\n",
    "num_skill = len(raw_question)\n",
    "\n",
    "# question id from 0 to (num_skill - 1)\n",
    "questions = { p: i for i, p in enumerate(raw_question) }\n",
    "\n",
    "print(\"number of skills: %d\" % num_skill)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [
    {
     "data": {
      "text/plain": "numpy.ndarray"
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(data.user_id.unique())"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2023-07-27T21:48:11.990311Z",
     "end_time": "2023-07-27T21:48:12.003789Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-07-27T21:54:01.081974Z",
     "end_time": "2023-07-27T21:54:01.094184Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "parse student sequence:\t: 100%|██████████| 1/1 [00:00<00:00, 500.75it/s]\n"
     ]
    }
   ],
   "source": [
    "def parse_all_seq(students):\n",
    "    all_sequences = []\n",
    "    for student_id in tqdm.tqdm(students, 'parse student sequence:\\t'):\n",
    "        student_sequence = parse_student_seq(data[data.user_id == student_id])\n",
    "        all_sequences.extend([student_sequence])\n",
    "    return all_sequences\n",
    "\n",
    "\n",
    "def parse_student_seq(student):\n",
    "    seq = student.sort_values('order_id')\n",
    "    q = [questions[q] for q in seq.skill_id.tolist()]\n",
    "    a = seq.correct.tolist()\n",
    "    return q, a\n",
    "\n",
    "\n",
    "# [(question_sequence_0, answer_sequence_0), ..., (question_sequence_n, answer_sequence_n)]\n",
    "sequences = parse_all_seq([64525])"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "outputs": [],
   "source": [
    "# def train_test_split(data, train_size=.7, shuffle=True):\n",
    "#     if shuffle:\n",
    "#         random.shuffle(data)\n",
    "#     boundary = round(len(data) * train_size)\n",
    "#     return data[: boundary], data[boundary:]\n",
    "\n",
    "\n",
    "test_sequences =sequences"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2023-07-27T21:54:04.134036Z",
     "end_time": "2023-07-27T21:54:04.138553Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-07-27T22:06:19.075933Z",
     "end_time": "2023-07-27T22:06:19.102687Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "write into file: 100%|██████████| 64/64 [00:00<00:00, 3640.89it/s]\n"
     ]
    }
   ],
   "source": [
    "def sequences2tl(sequences, trgpath):\n",
    "    with open(trgpath, 'w', encoding='utf8') as f:\n",
    "        for seq in tqdm.tqdm(sequences, 'write into file: '):\n",
    "            questions, answers = seq\n",
    "            seq_len = len(questions)\n",
    "            f.write(str(seq_len) + '\\n')\n",
    "            f.write(','.join([str(q) for q in questions]) + '\\n')\n",
    "            f.write(','.join([str(a) for a in answers]) + '\\n')\n",
    "\n",
    "\n",
    "sequences2tl(test_sequences*64, 'data/2009_skill_builder_data_corrected/example_test.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "start_time": "2023-07-27T21:54:05.792715Z",
     "end_time": "2023-07-27T21:54:05.802361Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
